library(ggplot2)
library(ggpubr)
## Loading required package: magrittr
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:plyr':
## 
##     here
## The following object is masked from 'package:base':
## 
##     date
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(scales)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine

LOAD DATA

movies <- get(load("movies.RData"))
head(movies)
summary(movies)
##     title                  title_type                 genre    
##  Length:651         Documentary : 55   Drama             :305  
##  Class :character   Feature Film:591   Comedy            : 87  
##  Mode  :character   TV Movie    :  5   Action & Adventure: 65  
##                                        Mystery & Suspense: 59  
##                                        Documentary       : 52  
##                                        Horror            : 23  
##                                        (Other)           : 60  
##     runtime       mpaa_rating                               studio   
##  Min.   : 39.0   G      : 19   Paramount Pictures              : 37  
##  1st Qu.: 92.0   NC-17  :  2   Warner Bros. Pictures           : 30  
##  Median :103.0   PG     :118   Sony Pictures Home Entertainment: 27  
##  Mean   :105.8   PG-13  :133   Universal Pictures              : 23  
##  3rd Qu.:115.8   R      :329   Warner Home Video               : 19  
##  Max.   :267.0   Unrated: 50   (Other)                         :507  
##  NA's   :1                     NA's                            :  8  
##  thtr_rel_year  thtr_rel_month   thtr_rel_day    dvd_rel_year 
##  Min.   :1970   Min.   : 1.00   Min.   : 1.00   Min.   :1991  
##  1st Qu.:1990   1st Qu.: 4.00   1st Qu.: 7.00   1st Qu.:2001  
##  Median :2000   Median : 7.00   Median :15.00   Median :2004  
##  Mean   :1998   Mean   : 6.74   Mean   :14.42   Mean   :2004  
##  3rd Qu.:2007   3rd Qu.:10.00   3rd Qu.:21.00   3rd Qu.:2008  
##  Max.   :2014   Max.   :12.00   Max.   :31.00   Max.   :2015  
##                                                 NA's   :8     
##  dvd_rel_month     dvd_rel_day     imdb_rating    imdb_num_votes  
##  Min.   : 1.000   Min.   : 1.00   Min.   :1.900   Min.   :   180  
##  1st Qu.: 3.000   1st Qu.: 7.00   1st Qu.:5.900   1st Qu.:  4546  
##  Median : 6.000   Median :15.00   Median :6.600   Median : 15116  
##  Mean   : 6.333   Mean   :15.01   Mean   :6.493   Mean   : 57533  
##  3rd Qu.: 9.000   3rd Qu.:23.00   3rd Qu.:7.300   3rd Qu.: 58300  
##  Max.   :12.000   Max.   :31.00   Max.   :9.000   Max.   :893008  
##  NA's   :8        NA's   :8                                       
##          critics_rating critics_score    audience_rating audience_score 
##  Certified Fresh:135    Min.   :  1.00   Spilled:275     Min.   :11.00  
##  Fresh          :209    1st Qu.: 33.00   Upright:376     1st Qu.:46.00  
##  Rotten         :307    Median : 61.00                   Median :65.00  
##                         Mean   : 57.69                   Mean   :62.36  
##                         3rd Qu.: 83.00                   3rd Qu.:80.00  
##                         Max.   :100.00                   Max.   :97.00  
##                                                                         
##  best_pic_nom best_pic_win best_actor_win best_actress_win best_dir_win
##  no :629      no :644      no :558        no :579          no :608     
##  yes: 22      yes:  7      yes: 93        yes: 72          yes: 43     
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##  top200_box   director            actor1             actor2         
##  no :636    Length:651         Length:651         Length:651        
##  yes: 15    Class :character   Class :character   Class :character  
##             Mode  :character   Mode  :character   Mode  :character  
##                                                                     
##                                                                     
##                                                                     
##                                                                     
##     actor3             actor4             actor5         
##  Length:651         Length:651         Length:651        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##    imdb_url            rt_url         
##  Length:651         Length:651        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##                                       
## 

TRANSFORM DATA

#Runtime:
movies[!complete.cases(movies['runtime']),]
movies$runtime[movies$title == "The End of America"] <- 74
#https://www.imdb.com/title/tt1294790/

#Director:
movies$director[movies$title == "Lorenzo's Oil"] <- "George Miller"
#https://es.wikipedia.org/wiki/Lorenzo%27s_Oil_(pel%C3%ADcula)
movies$director[movies$title == "The Ninth Gate"] <- "Roman Polanski"
#https://es.wikipedia.org/wiki/The_Ninth_Gate

#Studio:
movies$studio <- as.character(movies$studio)
movies$studio[movies$title == "Oliver & Company"] <-
  "Walt Disney Pictures"
#https://es.wikipedia.org/wiki/Oliver_y_su_pandilla
movies$studio[movies$title == "Attack of the 50 Foot Woman"] <-
  "Woolner Brothers Pictures Inc."
#https://www.imdb.com/title/tt0051380/
movies$studio[movies$title == "Inbred"] <- "Melanie Light"
#https://www.imdb.com/title/tt1723124/fullcredits
movies$studio[movies$title == "Caveman"] <- "United Artists"
#https://es.wikipedia.org/wiki/El_cavern%C3%ADcola
movies$studio[movies$title == "Dirty Sanchez: The Movie"] <-
  "Vertigo Films"
#https://www.rottentomatoes.com/m/dirty_sanchez
movies$studio[movies$title == "The Man Who Sued God"] <-
  "Australian Film Finance Corporation (AFFC), New South Wales Film & Television Office, Showtime Australia See more"
#https://www.imdb.com/title/tt0268437/
movies$studio[movies$title == "Inserts"] <-
  "Film and General Productions"
#https://www.filmaffinity.com/es/film740308.html
movies$studio <- factor(movies$studio)

#DVD realease date
movies[!complete.cases(movies['dvd_rel_year']), ]
movies$dvd_rel_year[movies$title == "Charlie: The Life and Art of Charles Chaplin"] <-
  2003
movies$dvd_rel_month[movies$title == "Charlie: The Life and Art of Charles Chaplin"] <-
  11
movies$dvd_rel_day[movies$title == "Charlie: The Life and Art of Charles Chaplin"] <-
  5
#https://www.imdb.com/title/tt0379730/releaseinfo
movies$dvd_rel_year[movies$title == "The Squeeze"] <- 2015
movies$dvd_rel_month[movies$title == "The Squeeze"] <- 6
movies$dvd_rel_day[movies$title == "The Squeeze"] <- 9
#https://medium.com/@releasebandyal/producer-michael-doven-announces-release-of-the-squeeze-on-dvd-75a982d0a047
movies$dvd_rel_year[movies$title == "Electric Dreams"] <- 1984
#https://en.wikipedia.org/wiki/Electric_Dreams_(film)
movies$dvd_rel_year[movies$title == "The Last Remake of Beau Geste"] <-
  2010
movies$dvd_rel_month[movies$title == "The Last Remake of Beau Geste"] <-
  1
movies$dvd_rel_day[movies$title == "The Last Remake of Beau Geste"] <-
  11
#https://en.wikipedia.org/wiki/The_Last_Remake_of_Beau_Geste

#Actors:
movies[!complete.cases(movies['actor4']), ]
movies$actor4[movies$title == "Attack of the 50 Foot Woman"] <-
  "Roy Gordon"
#https://www.imdb.com/title/tt0051380/fullcredits/?ref_=tt_ov_st_sm

movies[!complete.cases(movies['actor5']), ]
movies$actor5[movies$title == "Attack of the 50 Foot Woman"] <-
  "George Douglas"
#https://www.imdb.com/title/tt0051380/fullcredits/?ref_=tt_ov_st_sm
movies$actor5[movies$title == "The Illusionist (L'illusionniste)"] <-
  "Eleanor Tomlinson"
#https://en.wikipedia.org/wiki/The_Illusionist_(2006_film)


#Column manipulation
movies$thtr_rel_date <- as.Date(paste0(movies$thtr_rel_year, "-", movies$thtr_rel_month, "-", movies$thtr_rel_day))
movies$dvd_rel_date <- as.Date(paste0(movies$dvd_rel_year, "-", movies$dvd_rel_month, "-", movies$dvd_rel_day))

movies$thtr_rel_decade <- as.numeric(format(movies$thtr_rel_date,"%Y")) - (as.numeric(format(movies$thtr_rel_date,"%Y")) %% 10)
movies$thtr_rel_decade <- as.factor(movies$thtr_rel_decade)

movies$top200_box <- NULL

movies$dvd_rel_day <- NULL
movies$dvd_rel_month <- NULL
movies$dvd_rel_year <- NULL

#movies$thtr_rel_day <- NULL
#movies$thtr_rel_month <- NULL
#movies$thtr_rel_year <- NULL

movies$imdb_url <- NULL
movies$rt_url <- NULL
head(movies)
summary(movies)
##     title                  title_type                 genre    
##  Length:651         Documentary : 55   Drama             :305  
##  Class :character   Feature Film:591   Comedy            : 87  
##  Mode  :character   TV Movie    :  5   Action & Adventure: 65  
##                                        Mystery & Suspense: 59  
##                                        Documentary       : 52  
##                                        Horror            : 23  
##                                        (Other)           : 60  
##     runtime       mpaa_rating                               studio   
##  Min.   : 39.0   G      : 19   Paramount Pictures              : 37  
##  1st Qu.: 92.0   NC-17  :  2   Warner Bros. Pictures           : 30  
##  Median :103.0   PG     :118   Sony Pictures Home Entertainment: 27  
##  Mean   :105.8   PG-13  :133   Universal Pictures              : 23  
##  3rd Qu.:115.5   R      :329   Warner Home Video               : 19  
##  Max.   :267.0   Unrated: 50   (Other)                         :514  
##                                NA's                            :  1  
##  thtr_rel_year  thtr_rel_month   thtr_rel_day    imdb_rating   
##  Min.   :1970   Min.   : 1.00   Min.   : 1.00   Min.   :1.900  
##  1st Qu.:1990   1st Qu.: 4.00   1st Qu.: 7.00   1st Qu.:5.900  
##  Median :2000   Median : 7.00   Median :15.00   Median :6.600  
##  Mean   :1998   Mean   : 6.74   Mean   :14.42   Mean   :6.493  
##  3rd Qu.:2007   3rd Qu.:10.00   3rd Qu.:21.00   3rd Qu.:7.300  
##  Max.   :2014   Max.   :12.00   Max.   :31.00   Max.   :9.000  
##                                                                
##  imdb_num_votes           critics_rating critics_score    audience_rating
##  Min.   :   180   Certified Fresh:135    Min.   :  1.00   Spilled:275    
##  1st Qu.:  4546   Fresh          :209    1st Qu.: 33.00   Upright:376    
##  Median : 15116   Rotten         :307    Median : 61.00                  
##  Mean   : 57533                          Mean   : 57.69                  
##  3rd Qu.: 58300                          3rd Qu.: 83.00                  
##  Max.   :893008                          Max.   :100.00                  
##                                                                          
##  audience_score  best_pic_nom best_pic_win best_actor_win best_actress_win
##  Min.   :11.00   no :629      no :644      no :558        no :579         
##  1st Qu.:46.00   yes: 22      yes:  7      yes: 93        yes: 72         
##  Median :65.00                                                            
##  Mean   :62.36                                                            
##  3rd Qu.:80.00                                                            
##  Max.   :97.00                                                            
##                                                                           
##  best_dir_win   director            actor1             actor2         
##  no :608      Length:651         Length:651         Length:651        
##  yes: 43      Class :character   Class :character   Class :character  
##               Mode  :character   Mode  :character   Mode  :character  
##                                                                       
##                                                                       
##                                                                       
##                                                                       
##     actor3             actor4             actor5         
##  Length:651         Length:651         Length:651        
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##                                                          
##  thtr_rel_date         dvd_rel_date        thtr_rel_decade
##  Min.   :1970-05-20   Min.   :1991-03-28   1970: 51       
##  1st Qu.:1990-12-07   1st Qu.:2001-05-15   1980:103       
##  Median :2000-09-15   Median :2004-03-02   1990:161       
##  Mean   :1998-06-15   Mean   :2004-12-06   2000:234       
##  3rd Qu.:2007-05-17   3rd Qu.:2008-02-19   2010:102       
##  Max.   :2014-12-25   Max.   :2015-06-09                  
##                       NA's   :5

SDA Homework 1.1

We chose the “Films” dataset and we are going to answer the following questions that we thought were interesting.

1. Is there a relationship between the type/genre and score?

2. Comparing different scores (critics/audience/imdb)

  • Is there any difference on the audience/critics score by genre?
  • Is there any difference on the audience/critics score by MPAA?

3. Comparison best picture/oscar wins and score/oscar wins

  • Are oscar-awarded films more liked?

5. Do actors and directors have a higher score once they won? (find the first film and see the tendency, look for sudden change)


1. Is there a relationship between the type/genre and score?

g1 <- ggplot(data=movies, aes(x=audience_score)) + geom_histogram(bins=30, aes(fill = title_type), position = "identity", alpha = 0.7) + scale_color_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + scale_fill_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + theme(legend.position = "none")

g2 <- ggplot(data=movies, aes(x=critics_score)) + geom_histogram(bins=30, aes(fill = title_type), position = "identity", alpha = 0.7) + scale_color_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + scale_fill_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + theme(legend.position = "none")

g3 <- ggplot(data=movies, aes(x=imdb_rating)) + geom_histogram(bins=30, aes(fill = title_type), position = "identity", alpha = 0.7) + scale_color_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0")) + scale_fill_manual(values = c("#00AFBB", "#E7B800", "#0AFBB0"))

ggarrange(
    g1,
    g2,
    g3,
    ncol = 3,
    nrow = 1,
    widths = c(200,200,400),
    labels = c("1", "2", "3")
  )

g1 <- ggplot(data=movies, aes(x=audience_score)) + geom_density(aes(color = title_type), kernel = "gaussian")
g2 <- ggplot(data=movies, aes(x=critics_score)) + geom_density(aes(color = title_type), kernel = "gaussian")
g3 <- ggplot(data=movies, aes(x=imdb_rating)) + geom_density(aes(color = title_type), kernel = "gaussian")

ggarrange(
    g1,
    g2,
    g3,
    ncol = 3,
    nrow = 1,
    widths = c(200,200,400)
  )

ggplot(data=movies, aes(x = reorder(genre, audience_score, median, order=TRUE), y = audience_score, group = genre, fill = genre)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw() + coord_flip()

ggplot(data=movies, aes(x = reorder(genre, critics_score, median, order=TRUE), y = critics_score, group = genre, fill = genre)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw() + coord_flip()

ggplot(data=movies, aes(x = reorder(genre, imdb_rating, median, order=TRUE), y = imdb_rating, group = genre, fill = genre)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw() + coord_flip()

2. Comparing different scores (critics/audience/imdb)

- Is there any difference on the audience/critics score by genre?

- Is there any difference on the audience/critics score by MPAA score?

score = list();

score$genre_score_agg = list(
  "critics" = aggregate(critics_score ~ genre , movies, mean),
  "audience" = aggregate(audience_score ~ genre, movies, mean),
  "imdb" = aggregate(imdb_rating ~ genre, movies, mean)
)


score$genre_metrics_grid = expand.grid(year =sort(unique(movies$genre)),
                                       metrics = c("Critic", "Audience", "IMDB"))

score$genre_metrics_grid$genre_score_agg =c(score$genre_score_agg$critics$critics_score,
                                            score$genre_score_agg$audience$audience_score, 
                                            score$genre_score_agg$imdb$imdb_rating)

score$genre_metrics_grid$genre_score_agg_rescaled = 
  c(
    rescale(score$genre_score_agg$critics$critics_score, from=(c(1,100))), 
    rescale(score$genre_score_agg$audience$audience_score, from=(c(1,100))), 
    rescale(score$genre_score_agg$imdb$imdb_rating, from=(c(1,10))))

ggplot(score$genre_metrics_grid)+
  geom_tile(aes(x=metrics, y=year,fill=genre_score_agg_rescaled))+
  scale_fill_distiller( limits=c(0,1), palette = "PuOr") +
  labs(title = "Comparision of scores across genres (Heatmap)", x = "", y="", fill="")+
  theme(plot.title = element_text(hjust = 0.5))

To compare the three different score types: Critic, Audience and IMDB to be comaral they must be rescaled as while Critic and Audience are of the range 1-100, IMDB is 1-10. After rescaling, to the range 0-1, a visual comparison of the different groups can be seen by arranging them on a heatmap.

The most striking feature is the lack of high scores for most genres, except for Documentary and Musical & Performing Arts. From the three groups Critics seem to be the most negative in their scoring, while Audience and IMDB to be more aligned. This clearly seen in Action & Adventure genre where Critics have a far lower score than the other two groups. This may be due to Critics having a different criteria on what constitutes a “good” movie but without details on the required metrics this is only speculation. Conversely though IMBD scores are most unaligned with the other groups for the two highest scoring genres, the reasoning for this can not be determined from the data available.

mean_score <-
  data.frame(
    "Score" = c(
      mean(movies$imdb_rating)*10,
      mean(movies$audience_score),
      mean(movies$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_all <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(), axis.text.y = element_blank()) + ggtitle("All genres")

#Action and adventure
action_adventure <- subset(movies, genre == "Action & Adventure")
mean_score <-
  data.frame(
    "Score" = c(
      mean(action_adventure$imdb_rating)*10,
      mean(action_adventure$audience_score),
      mean(action_adventure$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_action <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Action & Adventure")

#Animation
animation <- subset(movies, genre == "Animation")
mean_score <-
  data.frame(
    "Score" = c(
      mean(animation$imdb_rating)*10,
      mean(animation$audience_score),
      mean(animation$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_animation <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Animation")

#Art House & International
art_house <- subset(movies, genre == "Art House & International")

mean_score <-
  data.frame(
    "Score" = c(
      mean(art_house$imdb_rating)*10,
      mean(art_house$audience_score),
      mean(art_house$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_art_house <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.text.y = element_blank()) + ggtitle("Art House & International")

#Comedy
comedy <- subset(movies, genre == "Comedy")

mean_score <-
  data.frame(
    "Score" = c(
      mean(comedy$imdb_rating)*10,
      mean(comedy$audience_score),
      mean(comedy$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )
plot_comedy <-ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Comedy")

#Documentary
documentary <- subset(movies, genre == "Documentary")

mean_score <-
  data.frame(
    "Score" = c(
      mean(documentary$imdb_rating)*10,
      mean(documentary$audience_score),
      mean(documentary$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_documentary <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Documentary")

#Drama
drama <- subset(movies, genre == "Drama")

mean_score <-
  data.frame(
    "Score" = c(
      mean(drama$imdb_rating)*10,
      mean(drama$audience_score),
      mean(drama$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_drama <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.text.y = element_blank()) + ggtitle("Drama")

#Horror
horror <- subset(movies, genre == "Horror")

mean_score <-
  data.frame(
    "Score" = c(
      mean(horror$imdb_rating)*10,
      mean(horror$audience_score),
      mean(horror$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_horror <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Horror")

#Musical & Performing Arts
musical <- subset(movies, genre == "Musical & Performing Arts")

mean_score <-
  data.frame(
    "Score" = c(
      mean(musical$imdb_rating)*10,
      mean(musical$audience_score),
      mean(musical$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_musical <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.text.y = element_blank()) + ggtitle("Musical & Performing Arts")

#Mystery & Suspense
mistery <- subset(movies, genre == "Mystery & Suspense")
mean_score <-
  data.frame(
    "Score" = c(
      mean(mistery$imdb_rating)*10,
      mean(mistery$audience_score),
      mean(mistery$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_mistery <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Mystery & Suspense")

#Other
other <- subset(movies, genre == "Other")

mean_score <-
  data.frame(
    "Score" = c(
      mean(other$imdb_rating)*10,
      mean(other$audience_score),
      mean(other$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )
plot_other <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Other")

#Science Fiction & Fantasy
science <- subset(movies, genre == "Science Fiction & Fantasy")

mean_score <-
  data.frame(
    "Score" = c(
      mean(science$imdb_rating)*10,
      mean(science$audience_score),
      mean(science$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )
plot_science <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0, 88) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("Science Fiction & Fantasy")

#Combine the plots...
plot_combined_genre <-ggarrange(
    plot_all,
    plot_action,
    plot_animation,
    plot_art_house,
    plot_comedy,
    plot_documentary,
    plot_drama,
    plot_horror,
    plot_mistery,
    plot_musical,
    plot_other,
    plot_science,
    ncol = 3,
    nrow = 4,common.legend = TRUE, legend="bottom")+
  labs(title = "Comparision of scores across genres (Lollipop chart)", x = "", y="", fill="")+
  theme(plot.title = element_text(hjust = 0.5))


plot_combined_genre

Taking a look at the different lollipop charts, we can see different things:

  • Audience gives a higher score than Critics in 10/11 genres (90.9%). The only exception is the Documentary genre.
  • IMDB gives a higher score than Critics in 9/11 genres (81.8%). The only exception is the Documentary and Musical & Performance genres.
  • IMDB gives a higher score than Audience in 7/11 genres (63.6%).
  • Genres like Documentary and Other receive similar ratings from all IMDB, Critics and Audience, whereas in genres like Comedy or Action & Adventure the ratings differ more.

Taking this into account, we can say that the pattern observed in previously is maintained in the individual genres. Critics give the lowest ratings for almost all genres and IMDB and Audience alternate in giving the highest score. There is a difference in the raitings of the different sources according to the genre and this difference follows a pattern, which is more accentuated in some genres than in others.

mean_score <-
  data.frame(
    "Score" = c(
      mean(movies$imdb_rating)*10,
      mean(movies$audience_score),
      mean(movies$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_mpaa_all <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(), axis.text.y = element_blank()) + ggtitle("All MPAA ratings")

#G
g <- subset(movies, mpaa_rating == "G")
mean_score <-
  data.frame(
    "Score" = c(
      mean(g$imdb_rating)*10,
      mean(g$audience_score),
      mean(g$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_g <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("G")

#NC-17
nc_17 <- subset(movies, mpaa_rating == "NC-17")
mean_score <-
  data.frame(
    "Score" = c(
      mean(nc_17$imdb_rating)*10,
      mean(nc_17$audience_score),
      mean(nc_17$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_nc_17 <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(),axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("NC-17")

#PG
pg <- subset(movies, mpaa_rating == "PG")

mean_score <-
  data.frame(
    "Score" = c(
      mean(pg$imdb_rating)*10,
      mean(pg$audience_score),
      mean(pg$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_pg <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.x = element_blank(),axis.text.y = element_blank()) + ggtitle("PG")

#PG-13
pg_13 <- subset(movies, mpaa_rating == "PG-13")

mean_score <-
  data.frame(
    "Score" = c(
      mean(pg_13$imdb_rating)*10,
      mean(pg_13$audience_score),
      mean(pg_13$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )
plot_pg_13 <-ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("PG-13")

#R
r <- subset(movies, mpaa_rating == "R")

mean_score <-
  data.frame(
    "Score" = c(
      mean(r$imdb_rating)*10,
      mean(r$audience_score),
      mean(r$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_r <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.title.y = element_blank(),axis.text.y = element_blank()) + ggtitle("R")

#Unrated
unrated <- subset(movies, mpaa_rating == "Unrated")

mean_score <-
  data.frame(
    "Score" = c(
      mean(unrated$imdb_rating)*10,
      mean(unrated$audience_score),
      mean(unrated$critics_score)
    ),
    "Source" = c("IMDB", "Audience", "Critics")
  )

plot_unrated <- ggplot(mean_score, aes(Score, Source)) +
  geom_segment(aes(x = 0, y = Source, xend = Score, yend = Source,colour=Source) ) +
  geom_point(aes(  colour=Source) ) + xlim(0,84) + theme(legend.position = "none",axis.text.y = element_blank()) + ggtitle("Unrated")


#Combine the plots...
plot_combined_mpaa <-ggarrange(
  plot_mpaa_all,
  plot_g,
  plot_nc_17,
  plot_pg,
  plot_pg_13,
  plot_r,
  plot_unrated,
  ncol = 3,
  nrow = 3,common.legend = TRUE, legend="bottom")+
  labs(title = "Comparision of scores across MPAA ratings (Lollipop chart)", x = "", y="", fill="")+
  theme(plot.title = element_text(hjust = 0.5))


plot_combined_mpaa

If we now make a similar comparisson for the MPAA ratings, we can see the following: * Audience gives a higher score than Critics in 4/6 MPAA ratings (66.6%). The two MPAA ratings where Critics give a higher score are NC-17 and Unrated. * IMDB gives a higher score than Critics in 4/6 MPAA ratings (66.6%). Again, the two exceptions are NC-17 and Unrated. * IMDB tends to give a higher score than Audience. This happens, again, in 4/6 MPAA ratings (66.6%). The MPAA ratings where this does not happen are G and Unrated. * Exceptuating G and PG, the rest of the MPAA ratings receive clearly different, although not highly, scores from the three different sources.

From these findings we can see that overall IMDB and the Audience tend to give more positive reviews than the Critics. However, the scores given by the three sources are not that different one from the other in the majority of the MPAA ratings (the only exceptions being the extreme Critics score for NC-17 and PG-13). Therefore, there is not enough evidence to say that there is a significant difference in the score according to the MPAA rating.

3. Comparison best picture/oscar nominated and score/oscar nominated

- Are oscar-awarded films more liked?

ggplot(data=movies, aes(x = best_pic_nom, y = audience_score, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(data=movies, aes(x = best_actor_win, y = audience_score, group = best_actor_win, fill = best_actor_win)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(data=movies, aes(x = best_actress_win, y = audience_score, group = best_actress_win, fill = best_actress_win)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(data=movies, aes(x = best_dir_win, y = audience_score, group = best_dir_win, fill = best_dir_win)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(data=movies, aes(x = best_pic_nom, y = audience_score, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(data=movies, aes(x = best_pic_nom, y = critics_score, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(data=movies, aes(x = best_pic_nom, y = imdb_rating, group = best_pic_nom, fill = best_pic_nom)) + geom_boxplot(alpha = .7) + geom_jitter(width = .05, alpha = .4) + guides(fill = "none") + theme_bw()

ggplot(aes(x=best_pic_nom), data=movies[movies$best_pic_win=="yes",]) + geom_bar()

movies[movies$best_pic_nom=="no" & movies$best_pic_win=="yes",]
movies[movies$best_pic_nom=="no" & movies$best_pic_win=="yes",]$best_pic_nom <- "yes"
ggplot(aes(x=best_actor_win), data=movies[movies$best_pic_nom=="yes",]) + geom_bar()

ggplot(aes(x=best_actress_win), data=movies[movies$best_pic_nom=="yes",]) + geom_bar()

ggplot(aes(x=best_dir_win), data=movies[movies$best_pic_nom=="yes",]) + geom_bar()

 best_pic_movies <- subset(movies, c(best_pic_win == "yes"))

 #Add a column showing if they have won another award
 best_pic_movies$other_win <- NA

 if (best_pic_movies$best_actor_win == "yes" ||
     best_pic_movies$best_actress_win == "yes" ||
     best_pic_movies$best_dir_win == "yes") {
   best_pic_movies$other_win <- "yes"
 } else{
    best_pic_movies$other_win <- "no"
 }

 #Plots...
 #Best movies that have won another award
 best_movie_another_award <- plyr::count(best_pic_movies$other_win)
 best_movie_another_award <- best_movie_another_award %>%
   arrange((freq)) %>%
   mutate(lab.ypos = cumsum(freq) - 0.5 * freq)

 pie_another_award <-
   ggplot(best_movie_another_award, aes(x = "", y = freq, fill = x)) +
   geom_bar(width = 1,
            stat = "identity",
            color = "white") +
   coord_polar("y", start = 0) +
   theme_void() +
   geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
             position = position_stack(vjust = 0.5),
             color = "white") +
   labs(fill = "Another award") +
   ggtitle("Best movies w/ awarded member") +
   theme(plot.title = element_text(size = 12, hjust = 0.5)) +
   scale_fill_manual(values = "#00BFC4")
## Warning: Ignoring unknown aesthetics: digits
 #Best movies that have won best actor award
 best_movie_best_actor <- plyr::count(best_pic_movies$best_actor_win)
 best_movie_best_actor <- best_movie_best_actor %>%
   arrange((freq)) %>%
   mutate(lab.ypos = cumsum(freq) - 0.5 * freq)

 pie_best_actor <-
   ggplot(best_movie_best_actor, aes(x = "", y = freq, fill = x)) +
   geom_bar(width = 1,
            stat = "identity",
            color = "white") +
   coord_polar("y", start = 0) +
   theme_void() +
   geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
             position = position_stack(vjust = 0.5),
             color = "white") +
   labs(fill = "Best actor") +
   ggtitle("Best movies w/ best actor") +
   theme(plot.title = element_text(size = 12, hjust = 0.5))
## Warning: Ignoring unknown aesthetics: digits
 #Best movies that have won best actress award
 best_movie_best_actress <- plyr::count(best_pic_movies$best_actress_win)
 best_movie_best_actress <- best_movie_best_actress %>%
   arrange((freq)) %>%
   mutate(lab.ypos = cumsum(freq) - 1 * freq)

 pie_best_actress <-
   ggplot(best_movie_best_actress, aes(x = "", y = freq, fill = x)) +
   geom_bar(width = 1,
            stat = "identity",
            color = "white") +
   coord_polar("y", start = 0) +
   theme_void() +
   geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
             position = position_stack(vjust = 0.5),
             color = "white") +
   labs(fill = "Best actress") +
   ggtitle("Best movies w/ best actress") +
   theme(plot.title = element_text(size = 12, hjust = 0.5))
## Warning: Ignoring unknown aesthetics: digits
 #Best movies that have won best director award
 best_movie_best_director <- plyr::count(best_pic_movies$best_dir_win)
 best_movie_best_director <- best_movie_best_director %>%
   arrange((freq)) %>%
   mutate(lab.ypos = cumsum(freq) - 0.9 * freq)

 pie_best_director <-
   ggplot(best_movie_best_director, aes(x = "", y = freq, fill = x)) +
   geom_bar(width = 1,
            stat = "identity",
            color = "white") +
   coord_polar("y", start = 0) +
   theme_void() +
   geom_text(aes(label = signif((freq / 7) * 100), digits = 2),
             position = position_stack(vjust = 0.5),
             color = "white") +
   labs(fill = "Best director") +
   ggtitle("Best movies w/ best director") +
   theme(plot.title = element_text(size = 12, hjust = 0.5))
## Warning: Ignoring unknown aesthetics: digits
 #Combine plots!
 plot_combined_awarded <-
   ggarrange(
     pie_another_award,
     pie_best_actor,
    pie_best_actress,
    pie_best_director,
    ncol = 2,
    nrow = 2,
    widths = c(400,400,400,400)
  )+
  labs(title = "Proportion of awards in the best movies (Pie chart)", x = "", y="", fill="")+
  theme(plot.title = element_text(hjust = 0.5))

plot_combined_awarded

Looking at the pie charts, we can see that the movies in the dataset that have received the award for best film always have a cast/crew member with another award. It must be taken into account that the actors, actresses and directors could have received their award at any point in their career, not necessarily for the awarded film in which they participate. That is, the pie charts are not meant to show that if a cast/crew member receives an award for one film, that film is more likely to be awarded itself. However, the plots can serve to study the relationship between having high quality proffesionals working in a movie and that movie receiving an award.

Considering this, we can see that most awarded movies in this dataset have a director that has or will receive an award. The presence of awarded directors in 6/7 movies highly surpasses the proportion of awarded movies with awarded actors and actresses. This hints that the quality of the director is mor influent than that of the actors and actresses in determining if the movie is awarded for best film. This deduction makes sense, since actors are only responsible for playing their roles whereas the director’s decisions involve all the elements of a film.

# -are oscar-awarded films more liked?
#Audience score vs Best_pic_win
plot_audience_vs_best_pic <-
  ggplot(movies,
         aes(x = best_pic_win, y = audience_score, fill = best_pic_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best picture award", y = "Score")
#Critics score vs Best_pic_win
plot_critics_vs_best_pic <-
  ggplot(movies,
         aes(x = best_pic_win, y = critics_score, fill = best_pic_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best picture award", y = "Score")
#IMDB score vs Best_pic_win
plot_imdb_vs_best_pic <-
  ggplot(movies, aes(x = best_pic_win, y = imdb_rating, fill = best_pic_win)) +
  geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best picture award", y = "Score")
#Combine the plots...
plot_combined_best_pic <-
  ggarrange(
    plot_audience_vs_best_pic,
    plot_critics_vs_best_pic,
    plot_imdb_vs_best_pic,
    ncol = 3,
    nrow = 1
  )
plot_combined_best_pic

#Audience score vs Best_dir_win
plot_audience_vs_best_dir <-
  ggplot(movies,
         aes(x = best_dir_win, y = audience_score, fill = best_dir_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best director award", y = "Score")
#Critics score vs Best_dir_win
plot_critics_vs_best_dir <-
  ggplot(movies,
         aes(x = best_dir_win, y = critics_score, fill = best_dir_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best director award", y = "Score")
#IMDB score vs Best_dir_win
plot_imdb_vs_best_dir <-
  ggplot(movies, aes(x = best_dir_win, y = imdb_rating, fill = best_dir_win)) +
  geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best director award", y = "Score")
#Combine the plots...
plot_combined_best_dir <-
  ggarrange(
    plot_audience_vs_best_dir,
    plot_critics_vs_best_dir,
    plot_imdb_vs_best_dir,
    ncol = 3,
    nrow = 1
  )
plot_combined_best_dir

#Audience score vs Best_actor_win
plot_audience_vs_best_actor <-
  ggplot(movies,
         aes(x = best_actor_win, y = audience_score, fill = best_actor_win)) +
  geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best actor award", y = "Score")
#Critics score vs Best_dir_win
plot_critics_vs_best_actor <-
  ggplot(movies,
         aes(x = best_actor_win, y = critics_score, fill = best_actor_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best actor award", y = "Score")
#IMDB score vs Best_dir_win
plot_imdb_vs_best_actor <-
  ggplot(movies,
         aes(x = best_actor_win, y = imdb_rating, fill = best_actor_win)) + geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best actor award", y = "Score")
#Combine the plots...
plot_combined_best_actor <-
  ggarrange(
    plot_audience_vs_best_actor,
    plot_critics_vs_best_actor,
    plot_imdb_vs_best_actor,
    ncol = 3,
    nrow = 1
  )
plot_combined_best_actor

#Audience score vs Best_actress_win
plot_audience_vs_best_actress <-
  ggplot(movies,
         aes(x = best_actress_win, y = audience_score, fill = best_actress_win)) +
  geom_boxplot() + theme(legend.position = "none") + labs(title = "Audience", x = "Best actress award", y = "Score")
#Critics score vs Best_dir_win
plot_critics_vs_best_actress <-
  ggplot(movies,
         aes(x = best_actress_win, y = critics_score, fill = best_actress_win)) +
  geom_boxplot() + theme(legend.position = "none") + labs(title = "Critics", x = "Best actress award", y = "Score")
#IMDB score vs Best_dir_win
plot_imdb_vs_best_actress <-
  ggplot(movies,
         aes(x = best_actress_win, y = imdb_rating, fill = best_actress_win)) +
  geom_boxplot() + theme(legend.position = "none") + labs(title = "IMDB", x = "Best actress award", y = "Score")
#Combine the plots...
plot_combined_best_actress <-
  ggarrange(
    plot_audience_vs_best_actress,
    plot_critics_vs_best_actress,
    plot_imdb_vs_best_actress,
    ncol = 3,
    nrow = 1
  )
plot_combined_best_actress

5. Do actors and directors have a higher score once they won? (find the first film and see the tendency, look for sudden change)

best_dirs =  subset(movies, movies$best_dir_win == "yes") %>%
  group_by(director) %>%
  filter(n() > 2)

win_years = data.frame(
  year = c(2010, 2007,1987, 1990, 1978),
  director=c("Kathryn Bigelow", "Martin Scorsese","Oliver Stone","Oliver Stone","Woody Allen")
)

win_year_intersects = 
  geom_vline(data = win_years, aes(xintercept=year, color = director), show.legend = FALSE)

p1 = ggplot(best_dirs, aes(y=critics_score, x=thtr_rel_year, color=director))+
  geom_point()+
  geom_line(show.legend = FALSE) +
  labs(title = "Oscar winning director movie scores", y = "Critics Score", x = "", color="Movie Released")+
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
  win_year_intersects

p2 = ggplot(best_dirs, aes(y=audience_score, x=thtr_rel_year, color=director))+
  geom_point()+
  geom_line() +
  labs(y = "Audience Score", x = "")+
  theme(axis.text.x=element_blank(),
        axis.ticks.x=element_blank())+
  win_year_intersects

p3= ggplot(best_dirs, aes(y=imdb_rating, x=thtr_rel_year, color=director))+
  geom_point()+
  geom_line() +
  labs(y = "IMDB Rating", x = "Film Release Year")+ 
  win_year_intersects

ggarrange(p1, p2, p3, nrow = 3, common.legend = TRUE, legend="bottom")